knitr::opts_chunk$set(echo = T, root.dir = here::here())
# knitr::opts_knit$set(echo = T, root.dir = here::here())

library(dplyr)
library(ggplot2)
library(googledrive)
library(patchwork)
set.seed(2019)

Merge all cell-type and tissue enrichment datasets for meta-analysis.

Merge cell-types

Download

# Table 1
googledrive::drive_download("https://docs.google.com/spreadsheets/d/18bKV3mUuyIv1yPN7gps9HPmhROb4jz0x7KxtA8mf0iM/edit#gid=743954199", path = here::here("data/metaanalysis/TableS1.xlsx"), overwrite = T)
## ! Using an auto-discovered, cached token.
##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.
##   See gargle's "Non-interactive auth" vignette for more details:
##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googledrive package is using a cached token for
##   'brian_schilder@alumni.brown.edu'.
## File downloaded:
## • 'TableS1' <id: 18bKV3mUuyIv1yPN7gps9HPmhROb4jz0x7KxtA8mf0iM>
## Saved locally as:
## • '/Desktop/PD_omics_review/data/metaanalysis/TableS1.xlsx'
# Table 2
googledrive::drive_download("https://docs.google.com/spreadsheets/d/19jz9l2P7W2f1PWT9t3x0L8h6VQL021RdRI7D4FAOjX0/edit#gid=314686952", path = here::here("data/metaanalysis/TableS2.xlsx"), overwrite = T)
## File downloaded:
## • 'TableS2' <id: 19jz9l2P7W2f1PWT9t3x0L8h6VQL021RdRI7D4FAOjX0>
## Saved locally as:
## • '/Desktop/PD_omics_review/data/metaanalysis/TableS2.xlsx'
sheets <- readxl::excel_sheets(here::here("data/metaanalysis/TableS1.xlsx")) 
celltype_sheets <- sheets[endsWith(sheets,"celltypes")]
 
celltype_data <- lapply(celltype_sheets, function(sheet){
  print(sheet)
  dat <- readxl::read_excel(here::here("data/metaanalysis/TableS1.xlsx"), sheet = sheet)
  if(sheet=="Nalls2019_celltypes") dat <- subset(dat, Source!="step1_2_summary.txt" ) # & step3==1
  dat$Sheet <- gsub("_celltypes","",sheet)
  cols <- c("Cell_Type","Sig","Reference","Source","Dataset")  
  cols <- cols[cols %in% colnames(dat)]
  return(dplyr::relocate(dat, all_of(cols), .before = 1))
}) %>% `names<-`(celltype_sheets) %>%
  data.table::rbindlist(fill = T)
## [1] "Bryois2020_celltypes"
## [1] "Reynolds2019_celltypes"
## [1] "Corces2020_celltypes"
## [1] "Schilder2020_celltypes"
## [1] "Li2021_celltypes"
## [1] "Nalls2019_celltypes"
## New names:
## * `` -> ...17
## [1] "Agarwal2020_celltypes"
## [1] "Nott2019_celltypes"
## [1] "Coetzee2016_celltypes"
## [1] "Li2019_celltypes"

Preprocess

FDR plots

plot_fdr <- function(dat,
                     x="Score",
                     y="-log10(FDR)",
                     color="Sheet",
                     size=y,
                     label=NULL,
                     show_plot=FALSE){
  gg <- ggplot(dat, 
              aes_string(x=x, y=y, color=color, size=size, 
                  label=label)) + 
  geom_point(alpha=.5) + 
  theme_bw()
  if(show_plot) print(gg)
  return(gg)
}
plot_celltypes <- celltype_data %>%    
  subset(!is.na(Sig) & !is.na(Cell_Type) & !( Cell_Type %in% c("brain"))) %>%
  dplyr::mutate( Sig_neglog =  -log1p(Sig)) %>% 
  dplyr::group_by(Reference, .drop = F) %>%
  dplyr::mutate(Cell_Type= paste0(Cell_Type,"@",group_indices())) %>%
  ##  Calculate zscore for fine-mapping overlap   
  dplyr::summarise(Cell_Type,Source,Sheet,Dataset,Sig,Count, 
                   FDR = p.adjust(p = Sig, method = "fdr"),
                   bonf = p.adjust(p = Sig, method = "bonf"),
                   n_tests = n()) %>% 
  dplyr::mutate(Score = ifelse(Reference=="Schilder et al, 2020 (bioRxiv)", 
                               as.numeric(cut(Count, 10))/10, 
                               scales::rescale(-log10(FDR), c(0.0000000001,1))) ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(FDR_all = p.adjust(p = Sig, method = "fdr")) %>%
  subset(FDR_all<.05 & FDR<.05 & Sig < 0.05  & (!is.infinite(Score)))   
## Warning: `group_indices()` was deprecated in dplyr 1.0.0.
## Please use `cur_group_id()` instead.
## `summarise()` has grouped output by 'Reference'. You can override using the `.groups` argument.
# Sort by medians 
plot_celltypes <- plot_celltypes %>% 
  dplyr::group_by(Reference, Cell_Type) %>%
  dplyr::summarise(mean_Score = mean(Score, na.rm=T),
                   median_Score = median(Score, na.rm=T),
                   .groups = "keep") %>%
  merge(plot_celltypes, by=c("Reference","Cell_Type"))  %>%
  dplyr::arrange(Reference, desc(median_Score), bonf) 
plot_celltypes$Cell_Type <- factor(plot_celltypes$Cell_Type, 
                                   levels = rev(unique(plot_celltypes$Cell_Type)), ordered = T)
                 
gg_FDR1 <- gg_FDR2 <- plot_fdr(dat = plot_celltypes, y="FDR", label = "Cell_Type")
plotly::ggplotly(gg_FDR1)
gg_FDR2 <- plot_fdr(dat = plot_celltypes, label = "Cell_Type")
plotly::ggplotly(gg_FDR2)

Celltype counts

celltype_counts <- plot_celltypes %>% dplyr::group_by(Reference) %>% 
  dplyr::summarise(n_celltypes=dplyr::n_distinct(Cell_Type)) %>%
  dplyr::arrange(desc(n_celltypes))
knitr::kable(celltype_counts)
Reference n_celltypes
Nalls et al, 2019 (Lancet Neurology) 116
Schilder et al, 2020 (bioRxiv) 8
Coetzee et al, 2016 (Scientific Reports) 7
Reynolds et al. 2019 (npj Parkinson’s Disease) 4
Bryois et al, 2020 (Nature Genetics) 3
Li et al, 2019 (Nature Communications) 3
Nott et al, 2019 (Science) 3
Agarwal et al, 2020 (Nature Communications) 2

Summary plot

Summarise the top celltypes/tissues per study according to abstracted “Score” value (computed here).

### Limit the number of celltypes/study for visualization purposes
max_celltypes <- 25

celltype_plot <- ggplot(plot_celltypes %>% 
                          dplyr::group_by(Reference) %>% 
                          dplyr::slice_max(order_by = median_Score, n = max_celltypes),
       aes(x=Score, y=Cell_Type, fill=Reference)) + 
  # geom_bar(show.legend = F, stat="identity") +
  geom_boxplot(show.legend = F) +
  # geom_violin(show.legend = F) +
  geom_point(show.legend = F, alpha=.2) + 
  # Create a dicionary mapping the new labels
  scale_y_discrete(labels = setNames(gsub("@.*","",levels(plot_celltypes$Cell_Type )),
                                     levels(plot_celltypes$Cell_Type) ) ) + 
  # facet_wrap(facets = .~paste(gsub("[(]","\n(",Reference),"\n\n n tests =",n_tests),
  #          scales = "free", drop = F, nrow = 1) +  
  facet_grid(facets = paste0(gsub("[(]","\n(",Sheet),"\n (tests = ",n_tests,")")~.,
           scales = "free", space="free", drop = F) + 
  theme_bw() +
  theme(strip.background = element_rect(fill = "black"), 
        strip.text = element_text(color = "white", angle = 0),
        strip.text.y = element_text(color = "white", angle = 0),
         panel.grid.minor.x = element_blank())

print(celltype_plot)

# ggsave(here::here("plots/metaanalysis/celltype_metaanalysis.pdf"),celltype_plot, dpi = 300, height = 15)

Merge tissues

Download

tissue_sheets <- sheets[endsWith(sheets,"tissues")]
 
tissue_data <- lapply(tissue_sheets, function(sheet){
  print(sheet)
  dat <- readxl::read_excel(here::here("data/metaanalysis/TableS1.xlsx"), sheet = sheet)
  dat$Sheet <- gsub("_tissues","",sheet)
  cols <- c("Tissue","Sig","Reference","Source","Dataset")  
  cols <- cols[cols %in% colnames(dat)]
  return(dplyr::relocate(dat, all_of(cols), .before = 1))
}) %>% `names<-`(tissue_sheets) %>%
  data.table::rbindlist(fill = T)
## [1] "Bryois2020_tissues"
## [1] "Reynolds2019_tissues"
## [1] "Corces2020_tissues"
## [1] "Schilder2020_tissues"
## [1] "Nalls2019_tissues"
## [1] "Agarwal2020_tissues"
## [1] "Nott2019_tissues"
## [1] "Coetzee2016_tissues"
## [1] "Li2019_tissues"

Preprocess

FDR plots

plot_tissues <- tissue_data %>%   
  subset(!(Tissue %in% c(NA,"NA")) & (!is.na(Sig))) %>%
  dplyr::mutate( Sig_neglog =  -log1p(Sig)) %>% 
  ##  Calculate zscore for fine-mapping overlap  
  dplyr::group_by(Reference, .drop = F) %>%
  dplyr::mutate(Tissue= paste0(Tissue,"@",group_indices())) %>%
  dplyr::summarise(Tissue,Source,Sheet,Dataset,Sig,PP.H4,  
                   FDR = p.adjust(p = Sig, method = "fdr"),  
                   bonf = p.adjust(p = Sig, method = "bonf"), 
                   n_tests = n(),
                   .groups = "keep") %>% 
  # dplyr::ungroup() %>%
  dplyr::mutate(Score = ifelse(Reference=="Schilder et al, 2020 (bioRxiv)", 
                               as.numeric(cut(PP.H4, 10))/10, 
                               scales::rescale(-log(FDR), c(0.0000000001,1))) ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(FDR_all = p.adjust(p = Sig, method = "fdr")) %>%
  subset((FDR_all<.05 & FDR<.05 & Sig < 0.05) | (PP.H4>.8)) 
# Sort by medians 
plot_tissues <- plot_tissues %>%  
  dplyr::group_by(Reference, Tissue, .drop = F) %>% 
  dplyr::summarise(mean_Score = mean(Score, na.rm=T),
                   median_Score = median(Score, na.rm=T),
                   n_tissues = n_distinct(Tissue),
                   .groups = "keep") %>%
  merge(plot_tissues, by=c("Reference","Tissue"), all.y = T)  %>%
  dplyr::arrange(Reference, desc(median_Score), bonf) 
plot_tissues$Tissue <- factor(plot_tissues$Tissue, 
                                   levels = rev(unique(plot_tissues$Tissue)), ordered = T)
  
 
gg_FDR1 <- plot_fdr(dat = plot_tissues, y="FDR", label = "Tissue")
plotly::ggplotly(gg_FDR1)
gg_FDR2 <- plot_fdr(dat = plot_tissues, label = "Tissue")
plotly::ggplotly(gg_FDR2) 

Tissue counts

plot_tissues %>% dplyr::group_by(Reference) %>% 
  dplyr::summarise(n_tissues=dplyr::n_distinct(Tissue)) %>%
  dplyr::arrange(desc(n_tissues))
## # A tibble: 7 × 2
##   Reference                                      n_tissues
##   <chr>                                              <int>
## 1 Reynolds et al. 2019 (npj Parkinson's Disease)        29
## 2 Nalls et al, 2019 (Lancet Neurology)                  15
## 3 Coetzee et al, 2016 (Scientific Reports)               6
## 4 Bryois et al, 2020 (Nature Genetics)                   2
## 5 Agarwal et al, 2020 (Nature Communications)            1
## 6 Li et al, 2019 (Nature Communications)                 1
## 7 Nott et al, 2019 (Science)                             1

Summary plot

# plot_tissues[is.na(plot_tissues$Dataset),"Dataset"] <- plot_tissues[is.na(plot_tissues$Dataset),"Sheet"]
tissue_plot <- ggplot(plot_tissues, 
       aes(x=Score, y=Tissue, fill=Reference)) + 
  # geom_bar(show.legend = F, stat="identity") +
  geom_boxplot(show.legend = F) +
  # geom_violin(show.legend = F) +
  geom_point(show.legend = F, alpha=.2) +
  # facet_wrap(facets = .~paste(gsub("[(]","\n(",Reference),"\n\n n tests =",n_tests),
  #          scales = "free", drop = F, nrow = 1) +  
  facet_grid(facets = paste0(gsub("[(]","\n(",Sheet),"\n (tests = ",n_tests,")")~.,
           scales = "free", space="free", drop = F) + 
  scale_x_continuous(limits = c(0,1)) +
  # Create a dicionary mapping the new labels
  scale_y_discrete(labels = setNames(gsub("@.*","",levels(plot_tissues$Tissue)),
                                     levels(plot_tissues$Tissue) ) ) + 
  theme_bw() +
  theme(strip.background = element_rect(fill = "black"), 
        strip.text = element_text(color = "white", angle = 0),
        strip.text.y = element_text(color = "white", angle = 0), 
        panel.grid.minor.x = element_blank())

print(tissue_plot)

# ggsave("plots/metaanalysis/tissue_metaanalysis.pdf",tissue_plot, dpi = 300, height = 15)

Fuse plots

fused_plot <- (celltype_plot + labs(title="Cell-types") + 
                 tissue_plot + labs(title="Tissues")) +
  patchwork::plot_annotation(tag_levels = letters)
print(fused_plot)
## Warning: Removed 27 rows containing non-finite values (stat_boxplot).

# ggsave(here::here("plots/metaanalysis/metaanalysis.pdf"),fused_plot, height = 14, width = 15, dpi = 300)
# ggsave(here::here("plots/metaanalysis/metaanalysis.jpg"),fused_plot, height = 14, width = 15, dpi = 300)

Session Info

utils::sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] patchwork_1.1.1   googledrive_2.0.0 ggplot2_3.3.5     dplyr_1.0.7      
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.1  xfun_0.25         bslib_0.2.5.1     purrr_0.3.4      
##  [5] gargle_1.2.0      colorspace_2.0-2  vctrs_0.3.8       generics_0.1.0   
##  [9] viridisLite_0.4.0 htmltools_0.5.1.1 yaml_2.2.1        utf8_1.2.2       
## [13] plotly_4.9.4.9000 rlang_0.4.11      jquerylib_0.1.4   pillar_1.6.2     
## [17] glue_1.4.2        withr_2.4.2       DBI_1.1.1         rappdirs_0.3.3   
## [21] readxl_1.3.1      lifecycle_1.0.0   stringr_1.4.0     munsell_0.5.0    
## [25] gtable_0.3.0      cellranger_1.1.0  htmlwidgets_1.5.3 evaluate_0.14    
## [29] labeling_0.4.2    knitr_1.33        crosstalk_1.1.1   curl_4.3.2       
## [33] fansi_0.5.0       highr_0.9         Rcpp_1.0.7        openssl_1.4.4    
## [37] scales_1.1.1      jsonlite_1.7.2    farver_2.1.0      fs_1.5.0         
## [41] askpass_1.1       digest_0.6.27     stringi_1.7.3     grid_4.1.0       
## [45] rprojroot_2.0.2   here_1.0.1        cli_3.0.1         tools_4.1.0      
## [49] magrittr_2.0.1    sass_0.4.0        lazyeval_0.2.2    tibble_3.1.3     
## [53] tidyr_1.1.3       crayon_1.4.1      pkgconfig_2.0.3   ellipsis_0.3.2   
## [57] data.table_1.14.0 assertthat_0.2.1  rmarkdown_2.10    httr_1.4.2       
## [61] rstudioapi_0.13   R6_2.5.0          compiler_4.1.0